Data import

## Ambulance Data related to acute cerebrovascular accidents
brain_data <- read_csv("data/raw/brain_data.csv") %>%
   mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level), as.factor))
## Ambulance Long Data related to acute cerebrovascular accidents
brain_data_long <- read_csv("data/raw/brain_data_long.csv") %>%
    mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level, Sex, Age), as.factor))

## Ambulance Data related to cardiovascular diseases
heart_data <- read_csv("data/raw/heart_data.csv") %>%
   mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level), as.factor))
## Ambulance Long Data  related to unstable angina
UA_data_long <- read_csv("data/raw/UA_data_long.csv") %>%
   mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level, Sex, Age), as.factor))
## Ambulance Long Data related to acute myocardial infarction
AMI_data_long <- read_csv("data/raw/AMI_data_long.csv") %>%
   mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level, Sex, Age), as.factor))

Correlation of qualitative variables

# Data preprocessing
brain_data_clear <- brain_data %>% 
  select( where(is.numeric) & !ends_with(c("min", "max", "var", "K_Sum")) ) %>%
  filter(complete.cases(.))

# Function for displaying correlations with a neutral color for values close to 0
color_correlation_fixed <- function(data, mapping, ...) {
  
  x <- eval_data_col(data, mapping$x)
  y <- eval_data_col(data, mapping$y)
  cor_value <- cor(x, y, use = "complete.obs")
  
  # Defining color
  color <- if (abs(cor_value) < 0.1) {
    scales::alpha("gray50", 0.5)           ## Gray for weak correlations
  } else if (cor_value > 0) {
    scales::alpha("blue", abs(cor_value))  ## Blue for positive correlations
  } else {
    scales::alpha("red", abs(cor_value))   ## Red for negative correlations
  }
  
  # Filter for small correlation values
  cor_label <- ifelse(abs(cor_value) < 0.01, "0", sprintf("%.2f", cor_value))
  
  ggplot(data.frame()) + 
    annotate("text", x = 0.5, y = 0.5, label = cor_label, size = 10, color = color ) +
    theme_void() 
}

# Function for trend line with a golden color
golden_smooth <- function(data, mapping, ...) {
  ggplot(data, mapping) +
    geom_point(alpha = 0.4, size = 0.5, color = "black") + 
    geom_smooth(color = "gold", ...) + 
    theme_custom
}

# Plot construction
ggpairs(
  brain_data_clear, 
  progress = FALSE,
  upper = list(continuous = color_correlation_fixed), ## Highlighting correlations
  lower = list(continuous = golden_smooth) )+         ## Golden trend line
theme_custom +
  theme(
    axis.text.x = element_text(size = 10),  
    axis.text.y = element_text(size = 10) )   

cor_data <- brain_data %>% 
  select( where(is.numeric) & !ends_with(c("min", "max", "Sum", "var")) ) %>%
  rename(Temperature = Temp_mean, Precipitation = H2O, 
         `F10.7 index` = F10.7, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>% 
  cor(use = "pairwise.complete.obs") 
  
corrplot(cor_data, method = 'number', type = 'lower', diag = FALSE)

network_plot(cor_data, min_cor = .1)

When analyzing the network graph, three groups of features can be identified:

Additionally, some features within the groups exhibit a high degree of correlation!

brain_data %>% 
  drop_na() %>% 
  ggplot(aes(Temp_mean, Pressure))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
  labs(x = "Temperature")+
  theme_custom

brain_data %>% 
  ggplot(aes(Sunspots, F10.7))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
  labs(x = "Sunspot Number", y = "F10.7 index")+
  theme_custom

brain_data %>% 
  ggplot(aes(Dst_mean, Ap_mean))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
  labs(x = "Daily Average Dst index", y = "Daily Average Ap index")+
  theme_custom

Based on further analysis, the following variables will be excluded:

Solar cycles

brain_data %>% 
  select(!c(Pressure, F10.7, Ap_mean)) %>% 
  ggplot(aes(Date, Sunspots))+
  geom_line(alpha = 0.3, colour = "royalblue2")+
  geom_smooth(method = "gam", formula = y ~ s(x, k = 10),  se = FALSE,
              colour = "orangered4", linewidth = 2) +
  geom_vline(xintercept = as.Date("2009-01-01"), 
             colour = "orangered", linewidth = 1, linetype = "dashed")+
  geom_vline(xintercept = as.Date("2019-12-31"), 
             colour = "orangered", linewidth = 1, linetype = "dashed")+
  scale_x_date(date_breaks = "2 year", date_labels = "%Y")+
  labs(x = "", y = "Sunspot number")+
  theme_custom

The number of sunspots is directly linked to the solar activity cycles, as confirmed by the graph. For further analysis, it is advisable to focus separately on the period from 2009 to 2019, which covers the complete 24th solar activity cycle.

Data Decomposition

Decomposition allows us to identify trends and cyclicity:

ggarrange(

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Sunspots) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Sunspots"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(EMS) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("EMS"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Temp_mean) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Temperature"), 

  nrow = 1
)

ggarrange(

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Sunspots) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Sunspots"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Ap_mean) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Ap Index"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Dst_var) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Dst amplitude"), 
  
  nrow = 1
)

Visualizing Association

Dst ~ Sunspots

brain_data %>%
  filter(Dst_level != "Severe") %>%
  mutate(Dst_level = fct_relevel(Dst_level, c("Quiet", "Weak", "Moderate", "Major", "Severe"))) %>% 
  ggplot(aes(Dst_level, Sunspots))+
  geom_jitter(alpha = 0.5, size = 1, width = 0.2) +  
  geom_boxplot(alpha = 0.5, outliers = FALSE, fill = "goldenrod2", linewidth = 1)+
  labs( x = "Geomagnetic activity level",  y = "Sunspot Number" ) +
  theme_custom

DayOff ~ EMS

mean_values <- brain_data %>%
  group_by(DayOff) %>%
  summarise(mean_EMS = round(mean(EMS, na.rm = TRUE), 2), .groups = "drop")

ggplot(brain_data, aes(x = DayOff, y = EMS)) +
  geom_boxplot(aes(fill = DayOff)) +
  geom_text(data = mean_values, 
            aes(x = DayOff, y = mean_EMS, label = mean_EMS), 
            color = "black", size = 8, fontface = "bold", vjust = -1) +
  scale_fill_brewer(palette = "Dark2", direction = -1)+
  scale_y_continuous(limits = c(NA, max(brain_data$EMS) + 3))+
  labs(y = "Number of Emergency Medical Calls", x = "Non-working Days")+
  theme_custom +
  theme(legend.position = "none")+
  stat_pvalue_manual(t_test(data = brain_data,  EMS ~ DayOff),
                     label = "T-test, p = {p}", 
                     size = 10, 
                     y.position = max(brain_data$EMS) + 0.1)

  • The difference between the average EMS values for weekdays and weekends is statistically significant.
  • The average EMS value is higher on weekdays (13.33) compared to weekends (11.02).

Season ~ EMS

brain_data %>% 
mutate(Season = fct_relevel(Season, c("Winter", "Spring", "Summer", "Autumn"))) %>% 
ggplot(aes(x = Season, y = EMS, fill = Season)) +
  geom_boxplot(alpha = 0.7, show.legend = FALSE) +
  scale_fill_manual(
    values = c("Winter" = "dodgerblue2", "Spring" = "palegreen2", "Summer" = "tan2","Autumn" = "tomato2")) +
  labs(y = "Number of Emergency Medical Calls", x = "")+
  theme_custom 

brain_data %>%
  pairwise_t_test(EMS ~ Season, p.adjust.method = "holm") %>% 
  transmute(`Group 1` = group1, `Group 2` = group2, 
            `p-value` = p, `p.adjusted` = p.adj) %>%
  flextable() 

Group 1

Group 2

p-value

p.adjusted

Autumn

Spring

0.0162

0.0969

Autumn

Summer

0.1300

0.5200

Spring

Summer

0.3670

1.0000

Autumn

Winter

0.0259

0.1290

Spring

Winter

0.8650

1.0000

Summer

Winter

0.4670

1.0000

The distribution of emergency medical service calls across seasons is statistically insignificant (paired t-test with Holm p-value adjustment method: p.adjusted > 0.05).

Geomagnetic indices

brain_data %>% 
  drop_na(Kp_Sum, K_Sum) %>% 
  ggplot(aes(Kp_Sum, K_Sum))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
  labs(x = "Daily Average Kp index", y = "Daily Average K index (IRT)")+
  theme_custom

brain_data %>%
  drop_na(Kp_Sum, K_Sum) %>% 
  pivot_longer(cols = c(Kp_Sum, K_Sum), names_to = "index") %>% 
  ggplot() +
  geom_histogram(aes(x = value, fill = index), 
                 binwidth = 5, colour = "black", alpha = 0.5, position = "identity", show.legend = FALSE) +
  scale_fill_manual(
    values = c("K_Sum" = "darkseagreen2", "Kp_Sum" = "steelblue2") )+
  scale_x_continuous(breaks = seq(0, 50, 10))+
  labs( x = "Index Values", y = "Observation Count", fill = "Index Type") +
  theme_custom +
  facet_wrap(~index, labeller = labeller(index = c("K_Sum" = "Daily Average K index (IRT)",
                                                   "Kp_Sum" = "Daily Average Kp index")))

Daily Average Kp index and Daily Average K index (IRT) show a strong correlation (r = 0.91). However, K index (IRT) has a smaller range and a more uniform distribution compared to Kp index.

Age & Sex ~ EMS

ggarrange(

brain_data_long %>%
  filter(between(as.numeric(as.character(Year)), 2007, 2021), Age != "55+") %>% 
  group_by(Year, Sex, Age) %>%
  summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
  mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>% 
  ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
  geom_bar(stat = "identity",  alpha = 0.7, colour = "black", show.legend = FALSE) +
  scale_y_continuous(labels = abs, n.breaks = 10) + 
  scale_fill_manual(
    values = c("Male" = "steelblue", "Female" = "pink"),
    labels = c("Male" = "Male", "Female" = "Female") ) +
  coord_flip() +
  labs(
    title = "Ambulance Calls Distribution by Year, Age and Gender",
    x = "", y = "", fill = "Gender") +
  theme_custom + 
  facet_wrap(. ~ Age,  scales = "free_x", 
              labeller = labeller(Age = c("25-44" = "Age: 25-44 years", 
                                          "45-54" = "Age: 45-54 years"))), 

brain_data_long %>%
  filter(between(as.numeric(as.character(Year)), 2007, 2021), Age == "55+") %>% 
  group_by(Year, Sex, Age) %>%
  summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
  mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>% 
  ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
  geom_bar(stat = "identity",  alpha = 0.7, colour = "black", ) +
  scale_y_continuous(labels = abs, n.breaks = 10) + 
  scale_fill_manual(
    values = c("Male" = "steelblue", "Female" = "pink"),
    labels = c("Male" = "Male", "Female" = "Female") ) +
  coord_flip() +
  labs(
    x = "", y = "Total EMS", fill = "Gender") +
  theme_custom + theme(legend.position = "inside", legend.justification = c(0.9, 0.2))+
  facet_wrap(. ~ Age,  scales = "free_x", 
              labeller = labeller(Age = c("55+" = "Age: 55+ years"))), 

nrow = 2

)